{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "import pathlib\n",
    "\n",
    "import numpy as np\n",
    "import scipy.sparse\n",
    "import scipy.io\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import networkx as nx\n",
    "import utils.preprocess\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "# number of node types in the graph (0: movie, 1: director, 2: actor)\n",
    "num_ntypes = 3\n",
    "# directory that receives every file written by this notebook\n",
    "save_prefix = 'data/preprocessed/IMDB_processed/'"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [],
   "source": [
    "# read the raw IMDB metadata, then keep only the movies that have\n",
    "# both a first-billed actor and a director; re-index the survivors from 0\n",
    "movies = pd.read_csv('data/raw/IMDB/movie_metadata.csv', encoding='utf-8')\n",
    "movies = movies.dropna(axis=0, subset=['actor_1_name', 'director_name'])\n",
    "movies = movies.reset_index(drop=True)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [],
   "source": [
    "# extract labels, and delete movies with unwanted genres\n",
    "# 0 for action, 1 for comedy, 2 for drama, -1 for others\n",
    "genre_to_label = {'Action': 0, 'Comedy': 1, 'Drama': 2}\n",
    "# every movie starts as -1 (unwanted) until a wanted genre is found\n",
    "labels = np.full(len(movies), -1, dtype=int)\n",
    "# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0\n",
    "for movie_idx, genres in movies['genres'].items():\n",
    "    # the first wanted genre in the '|'-separated list decides the label\n",
    "    for genre in genres.split('|'):\n",
    "        if genre in genre_to_label:\n",
    "            labels[movie_idx] = genre_to_label[genre]\n",
    "            break\n",
    "# drop the unlabeled movies from both the frame and the label array so they stay aligned\n",
    "unwanted_idx = np.where(labels == -1)[0]\n",
    "movies = movies.drop(unwanted_idx).reset_index(drop=True)\n",
    "labels = np.delete(labels, unwanted_idx, 0)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [],
   "source": [
    "# sorted, de-duplicated director names\n",
    "directors = sorted(set(movies['director_name'].dropna()))\n",
    "# sorted, de-duplicated actor names gathered from the three actor columns\n",
    "actor_names = set()\n",
    "for actor_column in ('actor_1_name', 'actor_2_name', 'actor_3_name'):\n",
    "    actor_names.update(movies[actor_column].dropna().to_list())\n",
    "actors = sorted(actor_names)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "# build the adjacency matrix for the graph consisting of movies, directors and actors\n",
    "# 0 for movies, 1 for directors, 2 for actors\n",
    "# node index layout: [movies | directors | actors]\n",
    "director_offset = len(movies)\n",
    "actor_offset = director_offset + len(directors)\n",
    "dim = actor_offset + len(actors)\n",
    "type_mask = np.zeros((dim), dtype=int)\n",
    "type_mask[director_offset:actor_offset] = 1\n",
    "type_mask[actor_offset:] = 2\n",
    "\n",
    "# name -> node index; dict lookups replace the per-row list scans (`in` / `.index`)\n",
    "director_node = {name: director_offset + idx for idx, name in enumerate(directors)}\n",
    "actor_node = {name: actor_offset + idx for idx, name in enumerate(actors)}\n",
    "\n",
    "adjM = np.zeros((dim, dim), dtype=int)\n",
    "for movie_idx, row in movies.iterrows():\n",
    "    # missing names (NaN) are not keys of either mapping, so .get() yields None and no edge is added\n",
    "    neighbor_nodes = [director_node.get(row['director_name'])]\n",
    "    for actor_column in ('actor_1_name', 'actor_2_name', 'actor_3_name'):\n",
    "        neighbor_nodes.append(actor_node.get(row[actor_column]))\n",
    "    for node_idx in neighbor_nodes:\n",
    "        if node_idx is not None:\n",
    "            # undirected graph: set both directions of the edge\n",
    "            adjM[movie_idx, node_idx] = 1\n",
    "            adjM[node_idx, movie_idx] = 1"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "# extract bag-of-word representations of plot keywords for each movie\n",
    "# X is a sparse matrix\n",
    "vectorizer = CountVectorizer(min_df=2)\n",
    "movie_X = vectorizer.fit_transform(movies['plot_keywords'].fillna('').values)\n",
    "# assign features to directors and actors as the means of their associated movies' features\n",
    "# rows: directors followed by actors, columns: movies\n",
    "adjM_da2m = adjM[len(movies):, :len(movies)]\n",
    "# row-normalize by broadcasting; this gives the same values as multiplying by\n",
    "# np.diag(1 / row_sum) without allocating a dense square matrix and running a dense matmul\n",
    "adjM_da2m_normalized = adjM_da2m / adjM_da2m.sum(axis=1, keepdims=True)\n",
    "director_actor_X = scipy.sparse.csr_matrix(adjM_da2m_normalized).dot(movie_X)\n",
    "# stack in node-index order: movie rows first, then director/actor rows\n",
    "full_X = scipy.sparse.vstack([movie_X, director_actor_X])"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [
    {
     "name": "stdout",
     "text": [
      "(17446, 3)\n(95102, 3)",
      "\n",
      "(4278, 3)\n",
      "(95102, 5)\n",
      "(38476, 3)\n",
      "(156928, 5)\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "# metapaths to extract, indexed by node type (0: movie, 1: director, 2: actor)\n",
    "expected_metapaths = [\n",
    "    [(0, 1, 0), (0, 2, 0)],\n",
    "    [(1, 0, 1), (1, 0, 2, 0, 1)],\n",
    "    [(2, 0, 2), (2, 0, 1, 0, 2)]\n",
    "]\n",
    "# make sure one output directory per node type exists\n",
    "for ntype in range(num_ntypes):\n",
    "    pathlib.Path(save_prefix + '{}'.format(ntype)).mkdir(parents=True, exist_ok=True)\n",
    "for ntype in range(num_ntypes):\n",
    "    ntype_metapaths = expected_metapaths[ntype]\n",
    "    ntype_prefix = save_prefix + '{}/'.format(ntype)\n",
    "    # file stem of each metapath, e.g. (0, 1, 0) -> '0-1-0'\n",
    "    metapath_names = ['-'.join(map(str, metapath)) for metapath in ntype_metapaths]\n",
    "\n",
    "    # metapath based neighbor pairs and the metapath-based networks built from them\n",
    "    neighbor_pairs = utils.preprocess.get_metapath_neighbor_pairs(adjM, type_mask, ntype_metapaths)\n",
    "    G_list = utils.preprocess.get_networkx_graph(neighbor_pairs, type_mask, ntype)\n",
    "\n",
    "    # one networkx adjacency-list file per metapath\n",
    "    for G, metapath_name in zip(G_list, metapath_names):\n",
    "        nx.write_adjlist(G, ntype_prefix + metapath_name + '.adjlist')\n",
    "    # node indices of edge metapaths, one .npy file per metapath\n",
    "    all_edge_metapath_idx_array = utils.preprocess.get_edge_metapath_idx_array(neighbor_pairs)\n",
    "    for metapath_name, edge_metapath_idx_array in zip(metapath_names, all_edge_metapath_idx_array):\n",
    "        np.save(ntype_prefix + metapath_name + '_idx.npy', edge_metapath_idx_array)\n",
    "\n",
    "# adjacency matrix over all nodes, stored sparse\n",
    "scipy.sparse.save_npz(save_prefix + 'adjM.npz', scipy.sparse.csr_matrix(adjM))\n",
    "# feature matrix of each node type (movies, directors, actors) in its own file\n",
    "for ntype in range(num_ntypes):\n",
    "    ntype_rows = np.where(type_mask == ntype)[0]\n",
    "    scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(ntype), full_X[ntype_rows])\n",
    "# node type of every node\n",
    "np.save(save_prefix + 'node_types.npy', type_mask)\n",
    "# genre label of every movie\n",
    "np.save(save_prefix + 'labels.npy', labels)\n",
    "# train/validation/test split over movie indices: 400 movies are held out for validation,\n",
    "# then 3478 of the remainder for testing; what is left is the training set\n",
    "rand_seed = 1566911444\n",
    "all_movie_idx = np.arange(len(labels))\n",
    "train_idx, val_idx = train_test_split(all_movie_idx, test_size=400, random_state=rand_seed)\n",
    "train_idx, test_idx = train_test_split(train_idx, test_size=3478, random_state=rand_seed)\n",
    "# store each split in ascending index order\n",
    "for split_idx in (train_idx, val_idx, test_idx):\n",
    "    split_idx.sort()\n",
    "np.savez(save_prefix + 'train_val_test_idx.npz',\n",
    "         val_idx=val_idx,\n",
    "         train_idx=train_idx,\n",
    "         test_idx=test_idx)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  },
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}